library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
asexual_screen_data <- read_csv("otherdata/asexualscreendata2.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## current_version_ID = col_character(),
## gene = col_character(),
## cloneid = col_character(),
## experiments = col_character(),
## gene_name = col_character(),
## gene_product = col_character(),
## PfID = col_character(),
## strand = col_character(),
## call1 = col_logical(),
## call0 = col_logical(),
## phenotype = col_character(),
## type = col_character()
## )
## i Use `spec()` for the full column specifications.
analysedindetail <- c("PBANKA_010240", "PBANKA_082800", "PBANKA_041340", "PBANKA_090240", "PBANKA_141810", "PBANKA_071650", "PBANKA_143520")
loadAndCollapse <- function(file) {
counts <- read.csv(file, header = T, stringsAsFactors = FALSE)
counts <- counts[counts$barcode != "no_match", ]
fcounts <- as.matrix(counts[grep("\\.1$", names(counts))]) # forward
rcounts <- as.matrix(counts[grep("\\.2$", names(counts))]) # reverse
alltcounts <- fcounts + rcounts
colnames(alltcounts) <- gsub("\\.1", "", colnames(alltcounts))
rownames(alltcounts) <- counts$gene
alltcounts <- as.data.frame(t(alltcounts))
alltcounts$Index <- 1:nrow(alltcounts)
alltcounts$file <- file
row.names(alltcounts) <- NULL
return(alltcounts)
}
metadata <- read_csv("SampleAssignments.csv")
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9]
##
## -- Column specification --------------------------------------------------------
## cols(
## Run = col_character(),
## Index = col_double(),
## Description = col_character(),
## Pool = col_character(),
## Condition = col_character(),
## Mouse = col_character(),
## Replicate = col_double(),
## X8 = col_logical(),
## X9 = col_character()
## )
files <- unique(metadata$Run)
files <- files[!is.na(files)]
files <- paste0(files, ".csv")
dfs <- lapply(files, loadAndCollapse)
bigdf <- plyr::rbind.fill(dfs)
bigdf$Run <- gsub(".csv", "", bigdf$file)
bigdf <- merge(bigdf, metadata, by = c("Index", "Run"), all.x = T)
bigdf <- bigdf %>% filter(Condition != "DoNotAnalyse")
bigdf$Run
## [1] "PbSTM101" "replicateA" "replicateB" "PbSTM101" "replicateA"
## [6] "replicateB" "counts_24160" "PbSTM101" "replicateA" "replicateB"
## [11] "counts_24160" "replicateA" "replicateB" "replicateA" "replicateB"
## [16] "counts_24160" "replicateA" "replicateB" "replicateA" "replicateB"
## [21] "replicateA" "replicateB" "replicateA" "replicateB" "counts_24160"
## [26] "replicateA" "replicateB" "counts_24160" "replicateA" "replicateB"
## [31] "counts_24160" "replicateA" "replicateB" "counts_24160" "PbSTM101"
## [36] "replicateA" "replicateB" "counts_24160" "PbSTM101" "replicateA"
## [41] "replicateB" "counts_24160" "PbSTM101" "replicateA" "replicateB"
## [46] "counts_24160" "counts_24160" "counts_24160" "counts_24160" "counts_24160"
## [51] "counts_24160" "counts_24160" "PbSTM101" "counts_24160" "PbSTM101"
## [56] "PbSTM101" "counts_24160" "PbSTM101" "PbSTM101" "PbSTM101"
## [61] "PbSTM101" "PbSTM101" "PbSTM101" "PbSTM101" "PbSTM101"
## [66] "PbSTM101" "PbSTM101" "PbSTM101" "PbSTM101" "PbSTM101"
## [71] "PbSTM101" "PbSTM101"
bigdf$Index <- NULL
bigdf$file <- NULL
bigdf$Description <- NULL
bigdf$Mouse <- as.factor(bigdf$Mouse)
narrow <- melt(bigdf, id.vars = c("Pool", "Condition", "Mouse", "Replicate", "Run"), variable.name = "gene")
narrow <- narrow %>% filter(!is.na(Pool))
narrow$value <- as.numeric(as.character(narrow$value))
narrow$value[is.na(narrow$value)] <- 0
narrow <- narrow %>%
group_by(Pool, Condition, Mouse, Replicate) %>%
mutate(proportion = value / sum(value))
wellrepresentedinpassage <- narrow %>%
filter(Condition == "control" || Condition == "Negsort") %>%
group_by(gene, Pool) %>%
filter(max(proportion) > 0.0001) %>%
summarise(toinclude = T)
## `summarise()` has grouped output by 'gene'. You can override using the `.groups` argument.
narrow <- narrow %>%
left_join(wellrepresentedinpassage) %>%
filter(toinclude == T)
## Joining, by = c("Pool", "gene")
narrow <- narrow %>% mutate(value = value + 0.5)
narrow <- narrow %>%
group_by(Pool, Condition, Mouse, Replicate) %>%
mutate(proportion = (value) / sum(value))
narrow <- narrow %>% mutate(logprop = log2(proportion))
totals <- narrow %>%
group_by(Pool, Condition, Mouse, Replicate, Run) %>%
summarise(sum = sum(value))
## `summarise()` has grouped output by 'Pool', 'Condition', 'Mouse', 'Replicate'. You can override using the `.groups` argument.
Plot out how many total barcode reads we have from each condition in each sequencing run to check for relatively even coverage:
ggplot(totals, aes(x = paste(Pool, Condition, Mouse, Replicate), fill = Pool, y = sum)) +
geom_bar(stat = "identity") +
facet_wrap(~Run, scales = "free", ncol = 3) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 2))
## Examining the success of the super-pooling strategy
We observed that in the previous asexual screen (Bushell, Gomes, Sanderson, et al.), the integration efficiency of a vector was proportional to its geometric mean homology arm length. There were also reproducible elements of integration efficiency, which may have reflected epigenetic factors. As a result, to construct our superpool we matched vectors targeting dispensable genes according to their normalised abundance. And we matched vectors targeting slow genes according to their growth rate and normalised abundance.
Here we analyse whether this matching resulted in a superpool with more even vector abundance than would be expected in unmatched transfections.
proportions <- narrow %>%
filter(Pool == "SP3", Condition == "control") %>%
group_by(gene) %>%
summarise(proportion = mean(proportion))
asex <- asexual_screen_data %>% filter(phenotype %in% c("Slow", "Fast", "Dispensable"))
merge <- inner_join(proportions, asex, by = "gene")
ggplot(merge, aes(x = proportion, y = normd6toinputA)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
## Warning: Removed 1 rows containing missing values (geom_point).
merge$normd6toinputA <- log10(merge$normd7toinputA)
merge$proportion <- log10(merge$proportion)
merge$normd6toinputA <- merge$normd6toinputA - mean(merge$normd6toinputA)
merge$proportion <- merge$proportion - mean(merge$proportion)
narrower <- merge %>%
dplyr::select(gene, proportion, normd6toinputA) %>%
gather("key", "value", -gene)
ggplot(narrower, aes(x = value, fill = key)) +
geom_density(alpha = 0.5)
a <- ggplot(merge, aes(x = Relative.Growth.Rate, y = proportion)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
labs(y = "Relative proportion (powers\n of 10) in superpool", x = "Relative growth rate")
b <- ggplot(merge, aes(x = Relative.Growth.Rate, y = normd6toinputA)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
labs(y = "Relative proportion (powers\n of 10) in asexual screen", x = "Relative growth rate")
c <- ggplot(merge, aes(x = sqrt(left_arm_length^2 + right_arm_length^2), y = proportion)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
scale_x_log10() +
labs(y = "Relative proportion (powers\n of 10) in superpool", x = "Geometric mean homology arm length")
d <- ggplot(merge, aes(x = sqrt(left_arm_length^2 + right_arm_length^2), y = normd6toinputA)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm") +
scale_x_log10() +
labs(y = "Relative proportion (powers\n of 10) in asexual screen", x = "Geometric mean homology arm length")
library(patchwork)
f <- b + a + d + c + plot_layout(ncol = 2)
print(f)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggsave("output/superpool.pdf", f)
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
It is clear that relative proportion is much more independent of relative growth rate, and of geometric mean homology arm length, indicating the success of the matched-transfection approach.
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
variance <- narrow %>%
group_by(Pool, Condition, Mouse, gene) %>%
summarise(sd = sd(logprop), val1 = logprop[1], val2 = logprop[2], meanlogprop = mean(logprop))
## `summarise()` has grouped output by 'Pool', 'Condition', 'Mouse'. You can override using the `.groups` argument.
cors <- variance %>%
group_by(Condition, Mouse, Pool) %>%
summarise(cor = cor(val1, val2), n = n())
## `summarise()` has grouped output by 'Condition', 'Mouse'. You can override using the `.groups` argument.
ggplot(variance, aes(x = val1, y = val2, color = Condition)) +
geom_rect(data = cors, xmin = -50, xmax = 50, ymin = -50, ymax = 50, aes(fill = -cor, x = NA, y = NA, group = paste(Pool, Condition, Mouse))) +
geom_point(alpha = 0.4, size = 0.1) +
facet_grid(Pool ~ Condition + Mouse) +
scale_fill_distiller(palette = "Greys") +
theme(panel.background = element_rect(fill = "white")) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
scale_color_brewer(palette = "Set1") +
theme(
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()
)
## Warning: Ignoring unknown aesthetics: x, y
variance$Condition <- as.character(variance$Condition)
variance <- variance %>%
ungroup() %>%
mutate(Condition = ifelse(Pool == "SP3" & Condition == "control", "Negsort", Condition))
variance <- variance %>%
group_by(Pool, Condition, Mouse) %>%
arrange(-meanlogprop) %>%
mutate(medsd = rollmean(x = sd, k = 11, fill = "extend"))
variance <- variance %>% mutate(monotonicmedsd = cummax(medsd))
noncontrol <- variance %>% filter(Condition != "Negsort")
control <- variance %>% filter(Condition == "Negsort")
comparison <- inner_join(ungroup(control), ungroup(noncontrol), by = c("Mouse", "gene", "Pool"))
comparison <- comparison %>% mutate(difference = meanlogprop.y - meanlogprop.x)
comparison <- comparison %>% mutate(differencesd = sqrt(monotonicmedsd.y^2 + monotonicmedsd.x^2))
comparison <- comparison %>% mutate(Condition = Condition.y)
gaussianMeanAndVarianceSplit <- function(vals, variances, return) {
df <- data.frame(value = vals, variance = variances)
df <- df[complete.cases(df), ]
vals <- df$value
variances <- df$variance
if (length(vals) == 1) {
var <- variances[1]
mean <- vals[1]
}
else {
precs <- 1 / variances
mean <- sum(vals * precs) / sum(precs)
var1 <- (1 / sum(precs)) * (1 / (length(vals) - 1)) * sum((vals - mean)**2 / variances)
var2 <- 1 / sum(precs)
if (is.na(var1)) {
var1 <- 0
}
var <- max(var1, var2)
}
if (return == "mean") {
return(mean)
}
if (return == "variance") {
return(var)
}
}
narrow %>%
filter(Condition %in% c("control", "Negsort", "GFPsort", "RFPsort")) %>%
group_by(gene) %>%
summarise(min = min(proportion), count = n()) %>%
arrange(-count, -min)
# Is PBANKA_071830 a good control? Same for PBANKA_120780, PBANKA_083110, PBANKA_010620
controls <- c("PBANKA_071830", "PBANKA_120780", "PBANKA_083110", "PBANKA_051060", "PBANKA_051820", "PBANKA_132610")
controlsset <- filter(comparison, gene %in% controls)
write_csv(controlsset, "tempabc.csv")
# ggplot(comparisonmerge,aes(y=difference,ymin=difference-2*differencesd,ymax=difference+2*differencesd,x=1))+geom_point()+facet_wrap(~Condition+Pool)+ theme(axis.text.x = element_text(angle = 90, hjust = 1))+geom_errorbar()
controlsmerge <- controlsset %>%
group_by(Condition, Pool) %>%
summarise(difference = gaussianMeanAndVarianceSplit(difference, differencesd^2, "mean"), differencesd = sqrt(gaussianMeanAndVarianceSplit(difference, differencesd^2, "variance")))
## `summarise()` has grouped output by 'Condition'. You can override using the `.groups` argument.
comparisonnormalised <- inner_join(comparison, controlsmerge, by = c("Pool", "Condition"), suffix = c("", "control")) %>% mutate(difference = difference - differencecontrol, differencesd = sqrt(differencesd^2 + differencesdcontrol^2))
comparisonmerge <- comparisonnormalised %>%
group_by(Condition, gene) %>%
summarise(difference = gaussianMeanAndVarianceSplit(difference, differencesd^2, "mean"), differencesd = sqrt(gaussianMeanAndVarianceSplit(difference, differencesd^2, "variance")))
## `summarise()` has grouped output by 'Condition'. You can override using the `.groups` argument.
comparisonmerge <- comparisonmerge %>% mutate(p = 1 - pnorm(0, mean = difference, sd = differencesd))
comparisonmerge <- comparisonmerge %>% mutate(diffmax = difference + 2 * differencesd)
comparisonmerge <- comparisonmerge %>% mutate(diffmin = difference - 2 * differencesd)
comparisonmerge <- comparisonmerge %>% mutate(power = ifelse(diffmin > -1, "notreduced", ifelse(diffmax < (-1), "reduced", "nopower")))
GFP <- filter(comparisonmerge, Condition == "GFPsort")
GFP$gene <- as.character(GFP$gene)
RFP <- filter(comparisonmerge, Condition == "RFPsort")
RFP$gene <- as.character(RFP$gene)
# joint=full_join(GFP,RFP,by=c("gene","Pool"))
joint <- full_join(GFP, RFP, by = c("gene"), suffix = c(".gfp", ".rfp"))
joint$minmax <- pmin(joint$diffmax.gfp, joint$diffmax.rfp)
joint <- mutate(joint,
class = case_when(
diffmax.gfp < -1 & diffmax.rfp < -1 ~ "lossOfBoth",
diffmax.gfp < -1 ~ "lossOfMales",
diffmax.rfp < -1 ~ "lossOfFemales",
diffmin.rfp > 1 & diffmin.gfp > 1 ~ "gainOfBoth",
diffmin.rfp > 1 ~ "gainOfFemales",
diffmin.gfp > 1 ~ "gainOfMales",
diffmax.gfp > -1 & diffmax.rfp > -1 ~ "No change",
TRUE ~ "Unsure"
)
)
library(tidyverse)
counts <- read_tsv("./ap2gtimecourse/counts.txt", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## i Use `spec()` for the full column specifications.
phenodata <- read_tsv("./ap2gtimecourse/phenodata.txt", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_character(),
## sample = col_character(),
## file = col_character(),
## rapamycin = col_character(),
## time = col_character(),
## experiment = col_double(),
## colorw = col_character(),
## shape = col_double(),
## amplified = col_double(),
## Strain = col_character(),
## timew = col_character(),
## N = col_double()
## )
phenodata$X1 <- gsub("X820", "820", phenodata$X1)
narrow <- counts %>% gather("sample", "count", -X1)
colnames(narrow) <- c("gene", "X1", "count")
narrow <- narrow %>% filter(count > 0)
both <- inner_join(phenodata, narrow)
## Joining, by = "X1"
both <- both %>%
group_by(X1) %>%
mutate(proportion = count / sum(count))
counts2 <- as.matrix(select(counts, -X1))
rownames(counts2) <- counts$X1
timepoints <- unique(phenodata$time)
i <- 9
cols <- colnames(counts2)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
performDE <- function(filter) {
abc <- tibble()
filter2 <- filter %>% filter(Strain == "AP2Goe")
print(unique(filter$time))
if (length(unique(filter2$rapamycin)) > 1) {
counttable <- counts2[, base::match(filter2$X1, cols)]
dds <- DESeqDataSetFromMatrix(countData = counttable,
colData = filter2,
design= ~ rapamycin)
dds <- DESeq(dds)
res <- results(dds)
abc <- as.tibble(res)
abc$id = rownames(results(dds))
abc$timepoint <- unique(filter$time)
}
return(abc)
}
new <- phenodata %>%
group_by(time) %>%
filter(time != "3h") %>%
do(performDE(.))
## [1] "0h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## [1] "12h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "18h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "1h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "24h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "2h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "30h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "44h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "4h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "6h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "8h"
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "gams"
## [1] "schizont"
oneversion <- joint
asexual <- read_csv("./otherdata/asexualscreendata.csv", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## current_version_ID = col_character(),
## gene = col_character(),
## cloneid = col_character(),
## experiments = col_character(),
## gene_name = col_character(),
## gene_product = col_character(),
## PfID = col_character(),
## strand = col_character(),
## call1 = col_logical(),
## call0 = col_logical(),
## phenotype = col_character(),
## type = col_character()
## )
## i Use `spec()` for the full column specifications.
merge <- right_join(oneversion, asexual)
## Joining, by = "gene"
new$id <- gsub(".1$", "", new$id)
new$time2 <- as.numeric(gsub("h", "", new$time))
merge2 <- inner_join(new, merge, by = c("id" = "current_version_ID"))
filt <- merge2 %>%
group_by(timepoint) %>%
arrange(pvalue) %>%
filter(row_number() < 50) %>%
mutate(n = row_number())
ggplot(filt, aes(y = -n, x = time2, fill = class)) +
geom_tile() +
geom_line(data = filter(filt, !is.na(class)), aes(group = id, color = class, alpha = 0.5))
filt <- merge2 %>%
group_by(timepoint) %>%
arrange(pvalue) %>%
mutate(n = row_number())
ggplot(filt %>% filter(class !="No change", n<50), aes(y = n, x = time2, fill = class,color=class, group=id)) + geom_point(data=filt %>% filter(class =="No change", n<50) ,color="gray",alpha=0.2) + geom_line(data=filt %>% filter(class =="No change", n<50) ,color="gray",alpha=0.2)+
geom_point() +geom_line()+theme_bw()+
labs(y = "Genes ranked at each timepoint by p-value", x = "Timepoint (hours)")
library(tidyverse)
counts <- read_tsv("./ap2gtimecourse/counts.txt", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## i Use `spec()` for the full column specifications.
phenodata <- read_tsv("./ap2gtimecourse/phenodata.txt", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_character(),
## sample = col_character(),
## file = col_character(),
## rapamycin = col_character(),
## time = col_character(),
## experiment = col_double(),
## colorw = col_character(),
## shape = col_double(),
## amplified = col_double(),
## Strain = col_character(),
## timew = col_character(),
## N = col_double()
## )
phenodata$X1 <- gsub("X820", "820", phenodata$X1)
narrow <- counts %>% gather("sample", "count", -X1)
colnames(narrow) <- c("gene", "X1", "count")
narrow <- narrow %>% filter(count > 0)
both <- inner_join(phenodata, narrow)
## Joining, by = "X1"
both <- both %>%
group_by(X1) %>%
mutate(proportion = count / sum(count))
both$current_version_ID <- gsub(".1$", "", both$gene)
oneversion <- joint
asexual <- read_csv("./otherdata/asexualscreendata.csv", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## current_version_ID = col_character(),
## gene = col_character(),
## cloneid = col_character(),
## experiments = col_character(),
## gene_name = col_character(),
## gene_product = col_character(),
## PfID = col_character(),
## strand = col_character(),
## call1 = col_logical(),
## call0 = col_logical(),
## phenotype = col_character(),
## type = col_character()
## )
## i Use `spec()` for the full column specifications.
merge <- inner_join(oneversion, asexual)
## Joining, by = "gene"
newmerge <- filter(merge, class %in% c("lossOfMales", "lossOfFemales", "lossOfBoth")) %>% mutate(transcript = paste0(current_version_ID, ".1"))
gois <- newmerge$current_version_ID
sextranscriptomes <- read_tsv("./otherdata/sextranscriptomes.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## GeneID = col_character(),
## asexual1 = col_double(),
## asexual2 = col_double(),
## asexual3 = col_double(),
## female1 = col_double(),
## female2 = col_double(),
## female3 = col_double(),
## male1 = col_double(),
## male2 = col_double(),
## male3 = col_double()
## )
sextranscriptomes <- sextranscriptomes %>%
gather("typerep", "count", -GeneID) %>%
separate(typerep, c("type", "rep"), sep = -2) %>%
filter(count > 0)
sextranscriptomes <- sextranscriptomes %>%
group_by(type, rep) %>%
mutate(proportion = count / sum(count))
sextranscriptomesaverage <- sextranscriptomes %>%
group_by(type, GeneID) %>%
summarise(proportion = mean(proportion)) %>%
spread(key = type, value = proportion) %>%
mutate(malratio = mal / asexua, femalratio = femal / asexua)
## `summarise()` has grouped output by 'type'. You can override using the `.groups` argument.
sextranscriptomesaverage <- mutate(sextranscriptomesaverage,
sexexpclass = case_when(
malratio > 2 & femalratio > 2 ~ "bothExp",
malratio > 2 ~ "MaleExp",
femalratio > 2 ~ "FemaleExp",
TRUE ~ "Unsure"
)
)
ggplot(sextranscriptomes %>% filter(GeneID %in% gois), aes(x = type, y = proportion, color = type)) +
geom_point(alpha = 0.5) +
facet_wrap(~GeneID, scales = "free_y") +
expand_limits(y = 0)
combine <- inner_join(sextranscriptomesaverage, merge, by = c("GeneID" = "current_version_ID"))
table(combine$sexexpclass, combine$class)
##
## gainOfFemales gainOfMales lossOfBoth lossOfFemales lossOfMales
## bothExp 0 0 2 1 1
## FemaleExp 1 1 2 2 1
## MaleExp 0 1 9 2 5
## Unsure 0 1 25 16 12
##
## No change
## bothExp 182
## FemaleExp 90
## MaleExp 191
## Unsure 481
#Compare to curated data and plot ROC curves
curated <- read_csv("otherdata/curated.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Gene = col_character(),
## Description = col_character(),
## comb = col_character(),
## Overall = col_character(),
## `Male formation` = col_character(),
## `Male activation` = col_character(),
## `Female formation` = col_character(),
## `Female fertility` = col_character(),
## Other = col_character()
## )
table(curated$Overall)
##
## Difference No difference
## 66 235
merge <- inner_join(oneversion, asexual)
## Joining, by = "gene"
newmerge <- inner_join(merge, curated, by = c("current_version_ID" = "Gene"))
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:IRanges':
##
## cov, var
## The following objects are masked from 'package:S4Vectors':
##
## cov, var
## The following object is masked from 'package:BiocGenerics':
##
## var
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
overall <- roc(newmerge$Overall == "Difference", newmerge$minmax)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls > cases
male <- roc(newmerge$`Male formation` %in% c("Reduced", "No"), newmerge$diffmax.gfp)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls > cases
female <- roc(newmerge$`Female formation` %in% c("Reduced", "No"), newmerge$diffmax.rfp)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls > cases
overall
##
## Call:
## roc.default(response = newmerge$Overall == "Difference", predictor = newmerge$minmax)
##
## Data: newmerge$minmax in 125 controls (newmerge$Overall == "Difference" FALSE) > 32 cases (newmerge$Overall == "Difference" TRUE).
## Area under the curve: 0.6565
male
##
## Call:
## roc.default(response = newmerge$`Male formation` %in% c("Reduced", "No"), predictor = newmerge$diffmax.gfp)
##
## Data: newmerge$diffmax.gfp in 150 controls (newmerge$`Male formation` %in% c("Reduced", "No") FALSE) > 7 cases (newmerge$`Male formation` %in% c("Reduced", "No") TRUE).
## Area under the curve: 0.7324
female
##
## Call:
## roc.default(response = newmerge$`Female formation` %in% c("Reduced", "No"), predictor = newmerge$diffmax.rfp)
##
## Data: newmerge$diffmax.rfp in 149 controls (newmerge$`Female formation` %in% c("Reduced", "No") FALSE) > 8 cases (newmerge$`Female formation` %in% c("Reduced", "No") TRUE).
## Area under the curve: 0.7517
plot(male)
plot(female)
library(precrec)
##
## Attaching package: 'precrec'
## The following object is masked from 'package:pROC':
##
## auc
male <- evalmod(scores = -newmerge$diffmax.gfp, labels = newmerge$`Male formation` %in% c("Reduced", "No"))
female <- evalmod(scores = -newmerge$diffmax.rfp, labels = newmerge$`Female formation` %in% c("Reduced", "No"))
set.seed(283)
data <- read.csv("./otherdata/ExpressionPatternOttoEtAl.csv", header = T, stringsAsFactors = FALSE)
merge <- inner_join(oneversion, asexual)
## Joining, by = "gene"
m <- inner_join(merge, data, by = c("gene" = "Gene"))
df <- m[m$Total > 0, c("Ring", "Tro", "Sch", "Gam", "Ook")]
m$Ring <- as.numeric(m$Ring)
## Warning: NAs introduced by coercion
m$Tro <- as.numeric(m$Tro)
## Warning: NAs introduced by coercion
m$Sch <- as.numeric(m$Sch)
## Warning: NAs introduced by coercion
m$Gam <- as.numeric(m$Gam)
## Warning: NAs introduced by coercion
m$Ook <- as.numeric(m$Ook)
## Warning: NAs introduced by coercion
gathered <- gather(m, key = "stage", value = "expression", Ring, Tro, Sch, Gam, Ook)
gathered$stage <- factor(as.character(gathered$stage), levels = c("Ring", "Tro", "Sch", "Gam", "Ook"))
ggplot(filter(gathered, class != "gainOfFemales" & class != "gainOfMales"), aes(x = stage, y = expression)) +
stat_summary(aes(y = expression, group = 1), fun.y = mean, colour = "red", geom = "line", size = 1) +
facet_wrap(~class) +
expand_limits(y = 0)
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 5 rows containing non-finite values (stat_summary).
merge <- inner_join(oneversion, asexual)
## Joining, by = "gene"
interpro <- read.table("otherdata/interpro.txt", header = T, sep = "\t")
interpro <- unique(interpro[, c("Domain", "GeneNew")])
members <- read.csv("otherdata/InterProToMembers.csv")
joiner <- inner_join(interpro, members)
## Joining, by = "Domain"
interpro <- unique(joiner[, c("interpro", "GeneNew")])
m <- inner_join(merge, interpro, by = c("PfID" = "GeneNew"))
m2 <- m
m2$gene <- as.character(m$gene)
tallied <- m2 %>%
group_by(interpro, class) %>%
summarise(n = n()) %>%
spread(class, n, fill = 0)
## `summarise()` has grouped output by 'interpro'. You can override using the `.groups` argument.
totals <- m2 %>% group_by(class) %>% summarise(n=n())
tallied <- tallied %>%
ungroup() %>%
mutate(
p1 = phyper(lossOfMales - 1, filter(totals, class == "lossOfMales")$n, filter(totals, class != "lossOfMales")$n, lossOfBoth + gainOfMales + lossOfFemales, lower.tail = F),
p2 = phyper(lossOfFemales - 1, filter(totals, class == "lossOfFemales")$n, filter(totals, class != "lossOfFemales")$n, lossOfBoth + gainOfMales + lossOfMales, lower.tail = F)
)
# entrylist=read_tsv("entry.list")
# tallied<-inner_join(tallied,entrylist,by=c("interpro"="ENTRY_AC"))
# MostEnrichedViable<-head(arrange(tallied,p1))
# MostEnrichedViable
# MostEnrichedNonViable<-tail(arrange(tallied,-p2))
# MostEnrichedNonViable
# write_csv(tallied,"InterProEnrichment.csv")
# This was an attempt to use Charlie's mass spec interactome but did not yield useful data
edges <- read_tsv("/Users/theo/Desktop/adamsinteractome/edges.txt")
merge <- inner_join(oneversion, asexual)
hits <- filter(merge, minmax < 0)
filt <- filter(edges, source %in% hits$PfID, target %in% hits$PfID)
write_csv(abc, "filt2.csv")
abc <- inner_join(filt, merge, by = c("source" = "PfID"))
abc <- inner_join(abc, merge, by = c("target" = "PfID"))
ggplot(abc, aes(x = diffmax.rfp.x, y = diffmax.rfp.y)) +
geom_point()
ggplot(abc, aes(x = diffmax.gfp.x, y = diffmax.gfp.y)) +
geom_point()
# This is about Poran et al data
oneversion <- joint
load("/users/theo/desktop/All_merged_forWeb.rdata")
asexual <- read_csv("/Users/theo/gametocytes/gametocyteBarseq/otherdata/asexualscreendata.csv", col_names = T)
merge <- inner_join(oneversion, asexual)
library(Seurat)
pbmc <- CreateSeuratObject(
raw.data = all_merge, min.cells = 3, min.genes = 200,
project = "10X_PBMC"
)
pbmc <- FilterCells(
object = pbmc, subset.names = c("nGene"),
low.thresholds = c(200), high.thresholds = c(2500)
)
pbmc <- NormalizeData(
object = pbmc, normalization.method = "LogNormalize",
scale.factor = 10000
)
pbmc <- FindVariableGenes(
object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5
)
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI"))
pbmc <- RunPCA(
object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5,
genes.print = 5
)
PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
# pbmc <- JackStraw(object = pbmc, num.replicate = 100, do.print = FALSE)
# JackStrawPlot(object = pbmc, PCs = 1:12)
pbmc <- RunTSNE(object = pbmc, dims.use = 1:17, do.fast = TRUE, perplexity = 50)
TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE)
projected <- ProjectPCA(pbmc,
do.print = TRUE, pcs.print = 1:5, pcs.store = 17,
genes.print = 30, replace.pc = FALSE, do.center = FALSE
)
library(tidyverse)
genes <- rownames(pbmc@data)
cells <- colnames(pbmc@data)
newmat <- as.tibble(as.matrix(t(pbmc@data)))
newmat$cell <- cells
newmat$tsne1 <- pbmc@dr$tsne@cell.embeddings[, 1]
newmat$tsne2 <- pbmc@dr$tsne@cell.embeddings[, 2]
library(viridis)
ggplot(arrange(newmat, PF3D7_0315600), aes(x = tsne1, y = tsne2, color = log(PF3D7_0315600))) +
geom_point(size = 1, alpha = 0.5) +
scale_color_viridis()
# answer<-projected@dr$pca@cell.embeddings %*% t(projected@dr$pca@gene.loadings.full)
tib <- as.tibble(answer)
print("hi")
new <- merge %>%
arrange(minmax) %>%
filter(row_number() < 80) %>%
filter(PfID != "#N/A") %>%
filter(phenotype != "Essential") %>%
filter(phenotype != "Slow")
# Use purr
library(purrr)
library(rlang)
makePlot <- function(x, y) {
p <- ggplot(arrange(newmat, !!sym(x)), aes(x = tsne1, y = tsne2, color = log(!!sym(x)))) +
geom_point(size = 1, alpha = 0.5) +
scale_color_viridis() +
ggtitle(y) +
guides(color = FALSE)
return(p)
}
plots <- map2(new$PfID, new$gene, makePlot)
library(patchwork)
wrap_plots(plots)
ggsave("multiplot.png", width = 18, height = 18, )
asexual <- read_csv("otherdata/asexualscreendata.csv", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## current_version_ID = col_character(),
## gene = col_character(),
## cloneid = col_character(),
## experiments = col_character(),
## gene_name = col_character(),
## gene_product = col_character(),
## PfID = col_character(),
## strand = col_character(),
## call1 = col_logical(),
## call0 = col_logical(),
## phenotype = col_character(),
## type = col_character()
## )
## i Use `spec()` for the full column specifications.
merge <- inner_join(oneversion, asexual)
## Joining, by = "gene"
orthologs <- read_tsv("otherdata/orthologs.txt", col_names = F)
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_logical()
## )
orthomcl <- read_tsv("otherdata/allorthomcl.txt", col_names = T)
## Warning: Missing column names filled in: 'X4' [4]
##
## -- Column specification --------------------------------------------------------
## cols(
## `[Accession]` = col_character(),
## `[Taxon Code]` = col_character(),
## `[Group ID]` = col_character(),
## X4 = col_logical()
## )
colnames(orthomcl) <- c("Acc", "Tax", "Group", "Blank")
taxa <- c("pber", "pviv", "pfal", "bbov", "tpar", "tgon", "ncan", "chom", "cpar", "crei", "scer", "hsap", "ecol")
tally <- orthomcl %>%
group_by(Tax, Group) %>%
summarise(n = n()) %>%
filter(Tax %in% taxa)
## `summarise()` has grouped output by 'Tax'. You can override using the `.groups` argument.
tally$Tax <- factor(as.character(tally$Tax), levels = taxa)
new <- merge %>%
arrange(minmax) %>%
mutate(texter = paste(gene, class)) %>%
filter(row_number() < 70)
new$gene <- factor(as.character(new$gene), levels = as.character(new$gene))
new$texter <- factor(as.character(new$texter), levels = as.character(new$texter))
comb <- inner_join(new, orthologs, by = c("current_version_ID" = "X1"))
comb <- inner_join(comb, tally, by = c("X3" = "Group"))
ggplot(filter(comb, gene %in% analysedindetail), aes(x = Tax, y = texter, fill = n)) +
geom_tile() +
scale_fill_viridis_c()
asexual <- read_csv("otherdata/asexualscreendata.csv", col_names = T)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## current_version_ID = col_character(),
## gene = col_character(),
## cloneid = col_character(),
## experiments = col_character(),
## gene_name = col_character(),
## gene_product = col_character(),
## PfID = col_character(),
## strand = col_character(),
## call1 = col_logical(),
## call0 = col_logical(),
## phenotype = col_character(),
## type = col_character()
## )
## i Use `spec()` for the full column specifications.
merge <- inner_join(oneversion, asexual)
## Joining, by = "gene"
new <- merge %>%
arrange(minmax) %>%
mutate(texter = paste(gene, class)) %>%
filter(row_number() < 200)
painters3 <- read_csv("otherdata/paintergams3.csv")
## Warning: Missing column names filled in: 'X27' [27], 'X28' [28]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## `Times previously identified` = col_double(),
## NEW = col_logical(),
## X27 = col_logical(),
## X28 = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 183 parsing failures.
## row col expected actual file
## 3793 NEW 1/0/T/F/TRUE/FALSE * 'otherdata/paintergams3.csv'
## 3794 NEW 1/0/T/F/TRUE/FALSE * 'otherdata/paintergams3.csv'
## 3795 NEW 1/0/T/F/TRUE/FALSE * 'otherdata/paintergams3.csv'
## 3796 NEW 1/0/T/F/TRUE/FALSE * 'otherdata/paintergams3.csv'
## 3797 NEW 1/0/T/F/TRUE/FALSE * 'otherdata/paintergams3.csv'
## .... ... .................. ...... ............................
## See problems(...) for more details.
both <- inner_join(new, painters3, by = c("PfID" = "GeneID"))
p1 <- ggplot(both, aes(label = texter, y = as.numeric(`3D7 Gam Specific Stabilization`), color = class, x = fct_reorder(texter, minmax))) +
geom_point()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(p1)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
p2 <- ggplot(both, aes(label = texter, y = as.numeric(`3D7 Gam Specific Transcription`), color = class, x = fct_reorder(texter, minmax))) +
geom_point()
library(plotly)
ggplotly(p2)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
painters2 <- read_csv("otherdata/paintergams2.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X4' [4], 'X5' [5],
## 'X6' [6], 'X7' [7], 'X8' [8], 'X9' [9], 'X10' [10], 'X12' [12], 'X13' [13],
## 'X14' [14], 'X15' [15], 'X16' [16], 'X17' [17], 'X18' [18], 'X20' [20],
## 'X21' [21]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character()
## )
## i Use `spec()` for the full column specifications.